# Preliminary ----

if (!require(pacman)) install.packages("pacman") # install pacman to install any required packages

pacman::p_load(tidyverse, haven, Hmisc, conflicted)

conflict_prefer("filter", "dplyr")
conflict_prefer("select", "dplyr")

# Working directory: the directory one level above the root reported by
# here::here(), as an absolute path. All "data/..." paths below are relative
# to it.
basedir <- here::here("..") |>
  normalizePath()
setwd(basedir)

# Location of the BESIP panel file, relative to `basedir`.
BESIP_path <- "data/BESIP.rds"

# Clean British Election Study Internet Panel (BESIP) ----
## Import file ----

data <- readRDS(BESIP_path)

## Construct long dataset from wide BESIP file ----

# select columns of interest: the four parties' LookAfter items, the
# infoSource items, the respondent id, and the start time / end time / wave
# columns
data <- data %>%
  select(
    starts_with("conLookAfter"),
    starts_with("labLookAfter"),
    starts_with("snpLookAfter"),
    starts_with("ldLookAfter"),
    starts_with("infoSource"),
    id,
    starts_with("starttime"),
    starts_with("endtime"),
    starts_with("wave")
  )

# reshape the LookAfter variables
# Work on a copy whose column names spell "NoEducated" as "Uneducated"; the
# group alternatives in the pattern below use the "UneducatedW" spelling.
data_new <- data
names(data_new) <- gsub("NoEducated", "Uneducated", names(data_new))

# Every column name is split into three parts: party stem, group code (ending
# in "W") and wave number. The wave number is extracted as text and converted
# to numeric at the end of the pipeline.
data_long1 <- data_new %>%
  select(
    id,
    starts_with(c("conLookAfter", "labLookAfter", "snpLookAfter", "ldLookAfter"))
  ) %>%
  pivot_longer(
    cols = -id,
    names_to = c("party", "group", "wave"),
    names_pattern = "(.*)(BAW|MCW|WCW|UnempW|MuslimsW|JewsW|ChristW|AtheistW|LocalW|WhiteW|LondonW|YoungW|RetiredW|EducatedW|UneducatedW|WomenW|MenW)(\\d+)",
    values_to = "value",
    names_repair = "universal"
  ) %>%
  mutate(wave = as.numeric(wave))

# reshape the wave variables
# One row per respondent and wave column ("waves_taken" is not a per-wave
# column, so it is excluded). Rows whose flag equals 0 (wave not taken) are
# dropped, and the flag itself is removed afterwards, leaving only id and wave.
data_long2 <- data %>%
  select(id, starts_with("wave"), -"waves_taken") %>%
  pivot_longer(
    cols = -id,
    names_to = "wave",
    names_prefix = "wave",
    names_transform = list(wave = as.integer),
    values_to = "value"
  ) %>%
  filter(value != 0) %>%
  select(-value)

# validate: both reshaped tables should report the same number of respondents
length(unique(data_long1$id))
length(unique(data_long2$id)) # good

# reshape starttime and endtime variables
# Both column families are named <prefix><wave number>, so a single helper
# pivots either one into an id / wave / <value_col> table.
pivot_wave_times <- function(prefix, value_col) {
  data %>%
    select(id, starts_with(prefix)) %>%
    pivot_longer(
      cols = -id,
      names_to = "wave",
      names_prefix = prefix,
      names_transform = list(wave = as.integer),
      values_to = value_col
    )
}

data_long3a <- pivot_wave_times("starttimeW", "starttime")
data_long3b <- pivot_wave_times("endtimeW", "endtime")

# one row per id-wave carrying both timestamps
data_long3 <- data_long3a %>%
  full_join(data_long3b, by = c("id", "wave"))
length(unique(data_long3$id)) # good

# reshape and recode the infoSource variables

# Long format first (one row per id, source and wave), then wide again with
# one column per source: infoTV, infoPaper, infoRadio, infoInternet, infoPeople.
data_long4 <- data %>%
  select(id, starts_with("infoSource")) %>%
  pivot_longer(
    cols = -id,
    names_to = c("infosource", "wave"),
    names_pattern = "(infoSourceTVW|infoSourcePaperW|infoSourceRadioW|infoSourceInternetW|infoSourcePeopleW)(\\d+)",
    values_to = "newstime",
    names_repair = "universal"
  ) %>%
  mutate(
    # "infoSourceTVW" -> "infoTVW" -> "infoTV"
    infosource = str_replace(str_replace(infosource, "infoSource", "info"), "W", ""),
    wave = as.numeric(wave)
  ) %>%
  pivot_wider(names_from = infosource, values_from = newstime)

# set 'dont knows' (coded 9999) to missing and format as numeric
data_long4 <- data_long4 %>%
  mutate(across(
    c(infoTV, infoPaper, infoRadio, infoInternet, infoPeople),
    \(x) as.numeric(ifelse(x == 9999, NA_real_, x))
  ))

# combine reshaped data
# All four tables are keyed on respondent id and wave number.
data_long <- data_long1 %>%
  full_join(data_long2, by = c("id", "wave")) %>%
  full_join(data_long3, by = c("id", "wave")) %>%
  full_join(data_long4, by = c("id", "wave"))

## Clean long dataset ----
# remove non-existent interviews: the full joins create many id-wave
# combinations with no start time
data_long_nona <- filter(data_long, !is.na(starttime))

# create a date variable at the wave-level
# date_labels[i] is the label for wave i (assumes waves are numbered from 1 in
# the order listed here; the check below fails loudly if the data disagree).
# Some consecutive waves share a label; factor() merges them into one level.
date_labels <- c(
  "March 2014", "June 2014", "October 2014", "March 2015", "April 2015",
  "May 2015", "April 2016", "June 2016", "July 2016", "December 2016",
  "May 2017", "June 2017", "June 2017", "May 2018", "March 2019",
  "June 2019", "November 2019", "December 2019", "December 2019", "June 2020",
  "May 2021", "December 2021", "May 2022", "December 2022"
)

# Levels are given explicitly. Without them, factor() pairs the labels with
# the sorted unique wave values actually present, so a wave missing from the
# data would either raise an error or silently shift every later label.
wave_levels <- seq_along(date_labels)
stopifnot(
  all(is.na(data_long_nona$wave) | data_long_nona$wave %in% wave_levels)
)

data_long_nona$wavemonth <- factor(
  data_long_nona$wave,
  levels = wave_levels,
  labels = date_labels
)

# identifying and timing columns first, everything else in its existing order
data_long_nona <- data_long_nona |>
  select(id, wave, wavemonth, starttime, endtime, everything())

# clean up and recode
data_long_nona <- data_long_nona %>%
  rename(lookafter = value)
label(data_long_nona$lookafter) <- " "
label(data_long_nona$id) <- " "

# Lookup tables from the raw name fragments to readable labels. Fragments
# without an entry are left unchanged by the coalesce() fallback below.
party_labels <- c(
  labLookAfter = "Labour",
  conLookAfter = "Conservatives",
  ldLookAfter  = "Liberal Democrats",
  snpLookAfter = "SNP"
)
group_labels <- c(
  BAW      = "ethnic minorities",
  MCW      = "middle-class people",
  WCW      = "working class people",
  UnempW   = "the unemployed",
  LondonW  = "people from London",
  YoungW   = "young people",
  RetiredW = "elderly people",
  WomenW   = "women",
  MenW     = "men",
  MuslimsW = "Muslims",
  JewsW    = "Jews",
  ChristW  = "Christians",
  AtheistW = "atheists"
)

data_long_nona <- data_long_nona %>%
  mutate(
    party = coalesce(unname(party_labels[party]), party),
    group = coalesce(unname(group_labels[group]), group)
  ) %>%
  # let go of groups we can't identify group appeals for; these still carry
  # their raw codes because they have no entry in group_labels
  filter(!group %in% c("LocalW", "WhiteW", "EducatedW", "UneducatedW"))

saveRDS(data_long_nona, "data/BESIP_clean.rds")

# Clean BES Election Studies (BESES), rounds 1997-2019 ----

## Import files ----

## define file names
file_names <- c(
  "BES1997.dta", "BES2001.dta", "BES2005.dta", "BES2010.dta",
  "BES2015.dta", "BES2017.dta", "BES2019.dta"
)

## import all files; "BESXXXX.dta" is stored in an object named "orig_yXXXX"

# strip the extension, then swap the "BES" prefix for "orig_y"
df_names <- sub("^BES", "orig_y", sub("\\.dta$", "", file_names))

for (i in seq_along(file_names)) {
  # read the Stata file from data/ and bind it to its object name
  assign(df_names[i], read_dta(paste0("data/", file_names[i])))
}

# select columns of interest (lookafter variables + date variables)

# 1997
# suffixes: wc working class, mc middle class, un unemployed, bb big business,
# tu trade unions, bk black asians, be people on benefits, ri very rich
# iw time: enddate1 is day-month-year digits without separators
y1997 <- orig_y1997 %>%
  select(
    starts_with("conint"),
    starts_with("labint"),
    "enddate1"
  )

# 2001
# cq9 is labour; cq10 is conservative
# a working class, b middle class, c unemployed, d big business,
# e trade unions, f blacks asians, g women, h pensioners
# iw time: bstart and bend are already in "Date" format
y2001 <- orig_y2001 %>%
  select(all_of(c(
    paste0("cq9", letters[1:8]),
    paste0("cq10", letters[1:8]),
    "bstart", "bend"
  )))

# 2005
# cq2 is labour; cq3 is conservative
# a working class, b middle class, c unemployed, d big business,
# e trade unions, f blacks asians, g women, h pensioners
# iw time: startda2 is in ddmmyyyy format
y2005 <- orig_y2005 %>%
  select(all_of(c(
    paste0("cq2", letters[1:8]),
    paste0("cq3", letters[1:8]),
    "startda2"
  )))

# 2010
# cq2 is lab, cq3 is con, cq4 is lib dem
# 1 working class, 2 middle class, 3 unemployed, 4 big business,
# 5 trade unions, 6 blacks asians, 7 women, 8 pensioners
# iw time: bdate is formatted as "yyyymmdd" so eg "20100528" - needs a little work
y2010 <- orig_y2010 %>%
  select(
    starts_with("cq2_"),
    starts_with("cq3_"),
    starts_with("cq4_"),
    "bdate"
  )

# 2015
# b12 is con, b13 is lab
# 1 BAME, 2 trade unions, 3 middle c, 4 big bizz, 5 working c, 6 unemp
# iw time: Int_fin_time is already in date format - as.Date(y2015$Int_fin_time)
# gets it to ordinary "Date" format
y2015 <- orig_y2015 %>%
  select(
    starts_with("b12"),
    starts_with("b13"),
    "Int_fin_time"
  )

# 2017
# b12 is con, b13 is lab
# 1 BAME, 2 trade unions, 3 middle c, 4 big bizz, 5 working c, 6 unemp
# iw time: Interview_Date is in character format dd/mm/yyyy -
# as.Date(y2017$Interview_Date, format = "%d/%m/%Y") gets it to "Date" format
y2017 <- orig_y2017 %>%
  select(
    starts_with("b12"),
    starts_with("b13"),
    "Interview_Date"
  )

# 2019
# b12 is con, b13 is lab
# 1 BAME, 2 trade unions, 3 middle c, 4 big bizz, 5 working c, 6 unemp
# iw time: Interview_Date is already in date format -
# as.Date(y2019$Interview_Date) gets it to ordinary "Date" format
y2019 <- orig_y2019 %>%
  select(
    starts_with("b12"),
    starts_with("b13"),
    "Interview_Date"
  )

## Harmonize variables ----

# custom functions for renaming lookafter variables
# Rename the 1997 lookafter columns to the harmonized
# "<party>LookAfter<Group>W" scheme.
#
# df: data frame whose lookafter columns are named "labint<code>" or
#     "conint<code>", where <code> is a two-letter group code.
# Returns df with those columns renamed; all other columns keep their names.
rename_variables_1997 <- function(df) {
  party_prefixes <- c(labint = "labLookAfter", conint = "conLookAfter")
  group_suffixes <- c(
    wc = "WCW", mc = "MCW", un = "UnempW", bb = "BizW",
    tu = "UnionsW", bk = "BAW", be = "BenefitsW", ri = "RichW"
  )

  col_names <- names(df)

  # Party stem: only at the start of a name. The replacement is a literal;
  # the patterns have no capture group, so no backreference is used.
  for (old in names(party_prefixes)) {
    col_names <- sub(paste0("^", old), party_prefixes[[old]], col_names)
  }

  # Group code: only at the end of a lookafter column. The short codes
  # ("un", "tu", "be", "ri") would otherwise rewrite unrelated columns that
  # merely contain those letters.
  is_lookafter <- grepl("^(lab|con)LookAfter", col_names)
  for (old in names(group_suffixes)) {
    col_names[is_lookafter] <- sub(
      paste0(old, "$"), group_suffixes[[old]], col_names[is_lookafter]
    )
  }

  names(df) <- col_names
  df
}

# Rename the 2001 lookafter columns (cq9a-h for Labour, cq10a-h for the
# Conservatives) to the harmonized "<party>LookAfter<Group>W" scheme.
#
# df: data frame holding the cq9*/cq10* columns.
# Returns df with renamed columns; names matching no pattern are untouched.
rename_variables_2001 <- function(df) {
  # Pattern -> replacement, applied in this order: the party stem first
  # (keeping the group letter), then the letter that is left after "After".
  substitutions <- c(
    "cq9([a-h])$"  = "labLookAfter\\1",
    "cq10([a-h])$" = "conLookAfter\\1",
    "Aftera$" = "AfterWCW",
    "Afterb$" = "AfterMCW",
    "Afterc$" = "AfterUnempW",
    "Afterd$" = "AfterBizW",
    "Aftere$" = "AfterUnionsW",
    "Afterf$" = "AfterBAW",
    "Afterg$" = "AfterWomenW",
    "Afterh$" = "AfterRetiredW"
  )

  renamed <- names(df)
  for (pattern in names(substitutions)) {
    renamed <- gsub(pattern, substitutions[[pattern]], renamed)
  }
  names(df) <- renamed

  df
}

# Rename the 2005 lookafter columns (cq2a-h for Labour, cq3a-h for the
# Conservatives) to the harmonized "<party>LookAfter<Group>W" scheme.
#
# df: data frame holding the cq2*/cq3* columns.
# Returns df with renamed columns; names matching no pattern are untouched.
rename_variables_2005 <- function(df) {
  # Pattern -> replacement, applied in this order: the party stem first
  # (keeping the group letter), then the letter that is left after "After".
  substitutions <- c(
    "cq2([a-h])$" = "labLookAfter\\1",
    "cq3([a-h])$" = "conLookAfter\\1",
    "Aftera$" = "AfterWCW",
    "Afterb$" = "AfterMCW",
    "Afterc$" = "AfterUnempW",
    "Afterd$" = "AfterBizW",
    "Aftere$" = "AfterUnionsW",
    "Afterf$" = "AfterBAW",
    "Afterg$" = "AfterWomenW",
    "Afterh$" = "AfterRetiredW"
  )

  renamed <- names(df)
  for (pattern in names(substitutions)) {
    renamed <- gsub(pattern, substitutions[[pattern]], renamed)
  }
  names(df) <- renamed

  df
}

# Rename the 2010 lookafter columns (cq2_* Labour, cq3_* Conservatives,
# cq4_* Liberal Democrats) to the harmonized "<party>LookAfter<Group>W" scheme.
#
# df: data frame whose lookafter columns are named "cq2_<n>", "cq3_<n>" or
#     "cq4_<n>" with <n> in 1..8.
# Returns df with those columns renamed; all other columns keep their names.
rename_variables_2010 <- function(df) {
  party_prefixes <- c(
    cq2 = "labLookAfter", cq3 = "conLookAfter", cq4 = "ldLookAfter"
  )
  group_suffixes <- c(
    "_1" = "WCW", "_2" = "MCW", "_3" = "UnempW", "_4" = "BizW",
    "_5" = "UnionsW", "_6" = "BAW", "_7" = "WomenW", "_8" = "RetiredW"
  )

  col_names <- names(df)

  # Party stem: only at the start of a name. The replacement is a literal;
  # the patterns have no capture group, so no backreference is used.
  for (old in names(party_prefixes)) {
    col_names <- sub(paste0("^", old), party_prefixes[[old]], col_names)
  }

  # Group number: only as the final "_<n>" of a lookafter column, so other
  # columns containing "_1" ... "_8" are left alone.
  is_lookafter <- grepl("^(lab|con|ld)LookAfter", col_names)
  for (old in names(group_suffixes)) {
    col_names[is_lookafter] <- sub(
      paste0(old, "$"), group_suffixes[[old]], col_names[is_lookafter]
    )
  }

  names(df) <- col_names
  df
}

# Rename the 2015/2017/2019 lookafter columns (b12_* Conservatives,
# b13_* Labour) to the harmonized "<party>LookAfter<Group>W" scheme.
#
# df: data frame whose lookafter columns are named "b12_<n>" or "b13_<n>"
#     with <n> in 1..6.
# Returns df with those columns renamed; all other columns keep their names.
rename_variables_2015_17_19 <- function(df) {
  party_prefixes <- c(b13 = "labLookAfter", b12 = "conLookAfter")
  group_suffixes <- c(
    "_1" = "BAW", "_2" = "UnionsW", "_3" = "MCW",
    "_4" = "BizW", "_5" = "WCW", "_6" = "UnempW"
  )

  col_names <- names(df)

  # Party stem: only at the start of a name. The replacement is a literal;
  # the patterns have no capture group, so no backreference is used.
  for (old in names(party_prefixes)) {
    col_names <- sub(paste0("^", old), party_prefixes[[old]], col_names)
  }

  # Group number: only as the final "_<n>" of a lookafter column, so other
  # columns containing "_1" ... "_6" are left alone.
  is_lookafter <- grepl("^(lab|con)LookAfter", col_names)
  for (old in names(group_suffixes)) {
    col_names[is_lookafter] <- sub(
      paste0(old, "$"), group_suffixes[[old]], col_names[is_lookafter]
    )
  }

  names(df) <- col_names
  df
}

# apply the functions to each dataframe
# (2015, 2017 and 2019 share one renaming function)
y1997 <- y1997 |> rename_variables_1997()
y2001 <- y2001 |> rename_variables_2001()
y2005 <- y2005 |> rename_variables_2005()
y2010 <- y2010 |> rename_variables_2010()
y2015 <- y2015 |> rename_variables_2015_17_19()
y2017 <- y2017 |> rename_variables_2015_17_19()
y2019 <- y2019 |> rename_variables_2015_17_19()

# lookafter variable labels: inspect the value codes used in each round
table(y1997$labLookAfterBAW)
table(y2001$labLookAfterBAW)
table(y2005$labLookAfterBAW)
table(y2010$labLookAfterBAW)
table(y2015$labLookAfterBAW)
table(y2017$labLookAfterBAW)
table(y2019$labLookAfterBAW)
# types of missings: -3, -1, 8, 9, 99, 999, -999

# they all have the same key values: 1 is very closely, 2 is fairly closely,
# 3 is not very closely, and 4 is not at all closely

# renaming and formatting date variables

# y1997
# enddate1 holds day-month-year digits without separators. Values with 7
# characters are given a leading zero so that all are ddmmyyyy.
enddate_chr <- as.character(y1997$enddate1)

# which() discards NA comparisons: nchar() of a missing string is NA, and an
# NA inside a logical subscript makes a multi-element assignment fail.
needs_zero <- which(nchar(enddate_chr) == 7)
enddate_chr[needs_zero] <- paste0("0", enddate_chr[needs_zero])

# insert separators (dd/mm/yyyy) and parse
y1997$date <- as.Date(
  gsub("(..)(..)(....)", "\\1/\\2/\\3", enddate_chr),
  format = "%d/%m/%Y"
)
summary(y1997$date) # validating

# the raw column is no longer needed
y1997 <- y1997 |>
  select(-enddate1)

# y2001
summary(y2001$bend)
summary(y2001$bstart)
# the interview start date is used as the harmonized date
y2001$date <- y2001$bstart

names(y2001)
y2001 <- y2001 |>
  select(-bstart, -bend)

# y2005
table(y2005$startda2) # is in ddmmyyyy
# NOTE(review): unlike the 1997 block, no leading zero is added before
# parsing; confirm that startda2 never has a 7-character value.
y2005$startda2 <- as.Date(y2005$startda2, format = "%d%m%Y")

# Blank out dates whose year is not the survey year. The year goes into a
# dedicated variable (not one named `year`, which would shadow the function)
# and the call is namespace-qualified so it works whether or not lubridate
# is attached.
startda2_year <- lubridate::year(y2005$startda2)
y2005$startda2[which(startda2_year != 2005)] <- NA
summary(y2005$startda2) # validating
y2005$date <- y2005$startda2

names(y2005)
y2005 <- y2005 |>
  select(-startda2)

# y2010
table(y2010$bdate) # is in yyyymmdd
y2010$bdate <- as.Date(as.character(y2010$bdate), format = "%Y%m%d")
summary(y2010$bdate) # validating
y2010$date <- y2010$bdate

names(y2010)
y2010 <- y2010 |>
  select(-bdate)

# y2015
y2015$date <- as.Date(y2015$Int_fin_time)
summary(y2015$date)

# Blank out dates whose year is not the survey year. The year goes into a
# dedicated variable (not one named `year`, which would shadow the function)
# and the call is namespace-qualified so it works whether or not lubridate
# is attached.
date_year_2015 <- lubridate::year(y2015$date)
y2015$date[which(date_year_2015 != 2015)] <- NA

names(y2015)
y2015 <- y2015 |>
  select(-Int_fin_time)

# y2017
table(y2017$Interview_Date)
# Interview_Date is parsed as dd/mm/yyyy text
y2017$date <- as.Date(y2017$Interview_Date, format = "%d/%m/%Y")
summary(y2017$date)

names(y2017)
y2017 <- y2017 |>
  select(-Interview_Date)

# y2019
summary(y2019$Interview_Date)
y2019$date <- as.Date(y2019$Interview_Date)
summary(y2019$date)

names(y2019)
y2019 <- y2019 |>
  select(-Interview_Date)

# add BES indicators: the election year each round belongs to
y1997 <- mutate(y1997, BES = 1997)
y2001 <- mutate(y2001, BES = 2001)
y2005 <- mutate(y2005, BES = 2005)
y2010 <- mutate(y2010, BES = 2010)
y2015 <- mutate(y2015, BES = 2015)
y2017 <- mutate(y2017, BES = 2017)
y2019 <- mutate(y2019, BES = 2019)

# merge: stack all rounds; columns missing from a round are filled with NA
fulldf <- list(y1997, y2001, y2005, y2010, y2015, y2017, y2019) %>%
  bind_rows() %>%
  # 'fake' id variable: the row position in the stacked data, assigned
  # before sorting
  mutate(id = row_number()) %>%
  select(id, BES, date, everything()) %>%
  arrange(BES, date)

# format lookafter variables so they're all the same:
# every column whose name contains "LookAfter"
lookAfter_vars <- names(fulldf)[grepl("LookAfter", names(fulldf))]

# the missing codes used across rounds; all are collapsed to 9999
missings <- c(-3, -1, 8, 9, 99, 999, -999)

fulldf <- fulldf %>%
  mutate(across(
    all_of(lookAfter_vars),
    \(x) ifelse(x %in% missings, 9999, as.numeric(x))
  ))

## Construct long dataset ----
# reshape the LookAfter variables: one row per respondent, party and group;
# each column name is split into its party stem and its group code
fulldf_long <- fulldf %>%
  select(
    id, BES, date,
    starts_with(c("conLookAfter", "labLookAfter", "ldLookAfter"))
  ) %>%
  pivot_longer(
    cols = -c(id, BES, date),
    names_to = c("party", "group"),
    names_pattern = "(.*)(BAW|MCW|WCW|UnempW|RetiredW|WomenW|UnionsW|RichW|BenefitsW|BizW)",
    values_to = "value",
    names_repair = "universal"
  )

## Clean long dataset ----

# flip the lookafter values to match coding in BESIP (1 is supposed to be low
# concern and 4 supposed to be high); any other value, including 9999 and NA,
# is kept as is
fulldf_long <- fulldf_long %>%
  mutate(value = case_when(
    value %in% 1:4 ~ 5 - value,
    TRUE ~ value
  ))

# exclude individuals with all 'value' as NA or 9999: keep only the ids that
# have at least one substantive answer
answered_ids <- fulldf_long %>%
  filter(!is.na(value), value != 9999) %>%
  pull(id) %>%
  unique()

fulldf_long <- fulldf_long %>%
  filter(id %in% answered_ids)

# remove NA rows (these are 'artificial' survey items from items not asked in
# certain waves etc.); row counts are printed before and after
nrow(fulldf_long)
fulldf_long_nona <- fulldf_long %>%
  filter(!is.na(value))
nrow(fulldf_long_nona)

# let go of groups we're not using (which we can't/don't identify group
# appeals for)
table(fulldf_long_nona$group)
unused_groups <- c("BenefitsW", "BizW", "RichW", "UnionsW")
fulldf_long_nona <- fulldf_long_nona %>%
  filter(!group %in% unused_groups)

# clean up and recode
fulldf_long_nona <- fulldf_long_nona %>%
  rename(lookafter = value)
label(fulldf_long_nona$lookafter) <- " "
label(fulldf_long_nona$id) <- " "

# Lookup tables from the raw name fragments to readable labels. Fragments
# without an entry are left unchanged by the coalesce() fallback below.
bes_party_labels <- c(
  labLookAfter = "Labour",
  conLookAfter = "Conservatives",
  ldLookAfter  = "Liberal Democrats"
)
bes_group_labels <- c(
  BAW      = "ethnic minorities",
  MCW      = "middle-class people",
  WCW      = "working class people",
  UnempW   = "the unemployed",
  RetiredW = "elderly people",
  WomenW   = "women"
)

fulldf_long_nona <- fulldf_long_nona %>%
  mutate(
    party = coalesce(unname(bes_party_labels[party]), party),
    group = coalesce(unname(bes_group_labels[group]), group)
  )

saveRDS(fulldf_long_nona, "data/BESrounds.rds")
